// #include "hnumslucmn.h"

#pragma once
#ifndef __HNUMSLUCMN_H__
#define __HNUMSLUCMN_H__

#include "hnum_slu_mt_machines.h"

/*
 * File name:           pssp_defs.h
 * Purpose:             Sparse matrix types and function prototypes
 * History:
 */

/****************************
  Include thread header file
  ***************************/
#if defined ( _SOLARIS )
#include <thread.h>
#include <sched.h>
#elif defined( _DEC )
#include <pthread.h>
#include <unistd.h>
#include <sys/mman.h>
#elif defined ( _OPENMP )
#include <omp.h>
#elif defined ( _PTHREAD )
#include <pthread.h>
#elif defined ( _CRAY )
#include <fortran.h>
#include <string.h>
#endif

/* Define my integer type int_t */
typedef int int_t; /* default */

#include "hnum_slu_mt_Cnames.h"
#include "hnum_supermatrix.h"
#include "hnum_slu_mt_util.h"
#include "hnum_pxgstrf_synch.h"
#include "hnum_cblas.h"



namespace harlinn
{
    namespace numerics
    {
        namespace SuperLU
        {
               /*
                * *************************************************
                *  Global data structures used in LU factorization
                * *************************************************
                * 
                *   nsuper: number of supernodes = nsuper+1, numbered between 0 and nsuper.
                *
                *   (supno, xsup, xsup_end):
                *      supno[i] is the supernode number to which column i belongs;
                *	xsup[s] points to the first column of supernode s;
                *      xsup_end[s] points to one past the last column of supernode s.
                *	Example: supno  0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
                *	          xsup  0 1 2 4 7
                *            xsup_end  1 2 4 7 12
                *	Note: dfs will be performed on supernode rep. relative to the new 
                *	      row pivoting ordering
                *
                *   (lsub, xlsub, xlsub_end):
                *      lsub[*] contains the compressed subscripts of the supernodes;
                *      xlsub[j] points to the starting location of the j-th column in
                *               lsub[*]; 
                *      xlsub_end[j] points to one past the ending location of the j-th
                *               column in lsub[*].
                *	Storage: original row subscripts in A.
                *
                *      During the course of sparse LU factorization, we also use
                *	(lsub, xlsub, xlsub_end, xprune) to represent symmetrically 
                *      pruned graph. Contention will occur when one processor is
                *      performing DFS on supernode S, while another processor is pruning
                *      supernode S. We use the following data structure to deal with
                *      this problem. Suppose each supernode contains columns {s,s+1,...,t},
                *      with first column s and last column t.
                *
                *      (1) if t > s, only the subscript sets for column s and column t
                *          are stored. Column t represents pruned adjacency structure.
                *
                *                  --------------------------------------------
                *          lsub[*]    ... |   col s    |   col t   | ...
                *                  --------------------------------------------
                *                          ^            ^           ^
                *                       xlsub[s]    xlsub_end[s]  xlsub_end[s+1]
                *                                   xlsub[s+1]      :
                *                                       :           :
                *                                       :         xlsub_end[t]
                *                                   xlsub[t]      xprune[t] 
                *                                   xprune[s]    
                *
                *      (2) if t == s, i.e., a singleton supernode, the subscript set
                *          is stored twice:
                *
                *                  --------------------------------------
                *          lsub[*]    ... |      s     |     s     | ...
                *                  --------------------------------------
                *                          ^            ^           ^
                *                       xlsub[s]   xlsub_end[s]  xprune[s]
                *
                *      There are two subscript sets for each supernode, the last column
                *      structures (for pruning) will be removed after the numerical LU
                *      factorization phase:
                *        o lsub[j], j = xlsub[s], ..., xlsub_end[s]-1
                *          is the structure of column s (i.e. structure of this supernode).
                *          It is used for the storage of numerical values.
                *	  o lsub[j], j = xlsub[t], ..., xlsub_end[t]-1
                *	    is the structure of the last column t of this supernode.
                *	    It is for the purpose of symmetric pruning. Therefore, the
                *	    structural subscripts can be rearranged without making physical
                *	    interchanges among the numerical values.
                *
                *       DFS will traverse the first subscript set if the supernode
                *       has not been pruned; otherwise it will traverse the second
                *       subscript list, i.e., the part of the pruned graph.
                *
                *   (lusup, xlusup, xlusup_end):
                *      lusup[*] contains the numerical values of the supernodes;
                *      xlusup[j] points to the starting location of the j-th column in
                *                storage vector lusup[*]; 
                *      xlusup_end[j] points to one past the ending location of the j-th 
                *                column in lusup[*].
                *	Each supernode is stored in column-major, consistent with Fortran
                *      two-dimensional array storage.
                *
                *   (ucol, usub, xusub, xusub_end):
                *      ucol[*] stores the numerical values of the U-columns above the
                *              supernodes. 
                *      usub[k] stores the row subscripts of nonzeros ucol[k];
                *      xusub[j] points to the starting location of column j in ucol/usub[]; 
                *      xusub_end[j] points to one past the ending location column j in
                *                   ucol/usub[].
                *	Storage: new row subscripts; that is indexed intp PA.
                *
                */
            struct GlobalLUBase_t
            {
                int     *xsup;    // supernode and column mapping 
                int     *xsup_end;
                int     *supno;   
                int     *lsub;    // compressed L subscripts 
                int	    *xlsub;
                int     *xlsub_end;
                
                int     *xlusup;
                int     *xlusup_end;
                
                int     *usub;
                int	    *xusub;
                int     *xusub_end;
                int     nsuper;   // current supernode number 
                int     nextl;    // next position in lsub[] 
                int     nextu;    // next position in usub[]/ucol[] 
                int     nextlu;   // next position in lusup[] 
                int     nzlmax;   // current max size of lsub[] 
                int     nzumax;   //    "    "    "      ucol[] 
                int     nzlumax;  //    "    "    "     lusup[] 
                // ---------------------------------------------------------------
                //  Memory managemant for L supernodes 
                //
                int  *map_in_sup;  // size n+1 - the address offset of each column
                                   // in lusup[*], which is divided into regions 
			            // by the supernodes of Householder matrix H.
			            // If column k starts a supernode in H,
			            // map_in_sup[k] is the next open position in
			            // lusup[*]; otherwise map_in_sup[k] gives the
			            // offset (negative) to the leading column
			            // of the supernode in H.
			            //
                int  dynamic_snode_bound;
                /* --------------------------------------------------------------- */

                ~GlobalLUBase_t()
                {
                    if(map_in_sup)
                    {
                        delete [] map_in_sup;
                    }
                }

            };

            template <typename T>
            struct GlobalLU_T : public GlobalLUBase_t
            {
                T *lusup;   // L supernodes
                T *ucol;    // U columns
            };


            /* 
                * *********************************************************************
                * The pxgstrf_shared_t structure contains the shared task queue and
                * the synchronization variables to facilitate parallel factorization. 
                * It also contains the shared L and U data structures.
                * *********************************************************************
                */
            struct pxgstrf_shared_Base_t 
            {
                // ----------------------------------------------------------------
                // Global variables introduced in parallel code for synchronization.
                //
                volatile int tasks_remain; // number of untaken panels 
                int          num_splits;   // number of panels split at the top 
                queue_t      taskq;        // size ncol - shared work queue 
                mutex_t      *lu_locks;    // 5 named mutual exclusive locks 
                volatile int *spin_locks;  // size ncol - mark every busy column 
                pan_status_t *pan_status;  // size ncol - panel status 
                int          *fb_cols;     // size ncol - mark farthest busy column 
                // ---------------------------------------------------------------- 
                int        *inv_perm_c;
                int        *inv_perm_r;
                int        *xprune;
                int        *ispruned;
                SuperMatrix *A;
                
                Gstat_t    *Gstat;
                int        *info;
            };

            template <typename T>
            struct pxgstrf_shared_T : public pxgstrf_shared_Base_t
            {
                T *Glu;
            };


            /* Arguments passed to each thread. */
            struct pxgstrf_threadarg_t 
            {
                int  pnum; /* process number */
                int  info; /* error code returned from each thread */
                superlumt_options_t *superlumt_options;
            };

            template <typename T>
            struct  pxgstrf_threadarg_T : public pxgstrf_threadarg_t 
            {
                T *pxgstrf_shared; /* shared for LU factorization */
            };

            int  *intMalloc (int);
            int  *intCalloc (int);

            void Destroy_SuperMatrix_Store(SuperMatrix *);
            void Destroy_CompCol_Matrix(SuperMatrix *);
            void Destroy_CompCol_Permuted(SuperMatrix *);
            void Destroy_CompCol_NCP(SuperMatrix *);
            void Destroy_SuperNode_Matrix(SuperMatrix *);
            void Destroy_SuperNode_SCP(SuperMatrix *);

            void pxgstrf_finalize(superlumt_options_t *, SuperMatrix *);

            void pxgstrf_relax_snode( const int n, superlumt_options_t *superlumt_options, pxgstrf_relax_t *pxgstrf_relax );

            

            void pxgstrf_super_bnd_dfs(
		      const int  pnum, /* process number */
		      const int  m,    /* number of rows in the matrix */
		      const int  n,    /* number of columns in the matrix */
		      const int  jcol, /* first column of the H-supernode */
		      const int  w,    /* size of the H-supernode */
		      SuperMatrix *A,  /* original matrix */
		      int        *perm_r,   /* in */
		      int        *iperm_r,  /* in; inverse of perm_r */
		      int        *xprune,   /* in */
		      int        *ispruned, /* in */
		      int        *marker,   /* modified */
		      int        *parent,   /* working array */
		      int        *xplore,   /* working array */
		      GlobalLUBase_t *Glu /* modified */
		      );

            void copy_mem_int(int howmany, void *old, void *new_);
            double dlangs(char *norm, SuperMatrix *A);
            float clangs(char *norm, SuperMatrix *A);
            float slangs(char *norm, SuperMatrix *A);
            double zlangs(char *norm, SuperMatrix *A);
            


            /*
             * Dynamically set up storage image in lusup[*], using the supernode
             * boundaries in H.
             */
            int DynamicSetMap(
	                      const int pnum,      /* process number */
	                      const int jcol,      /* leading column of the s-node in H */
	                      const int num,       /* number of elements requested */
	                      GlobalLUBase_t *Glu
	                      );

            double SuperLU_timer_();
            double usertimer_();

            int sp_ienv(int);
            double  slamch_(const char*);
            double  dlamch_(const char*);
            float slangs(char *, SuperMatrix *);
            

            
            void    superlu_abort_and_exit(char *);
            void *superlu_malloc (size_t);
            void superlu_free (void*);
            void PrintStat(Gstat_t *);
            int  ParallelProfile(const int, const int, const int,const int procs, Gstat_t *);

            void    ifill(int *, int, int);

            int  sp_coletree (int *, int *, int *, int, int, int *);
            void sp_colorder(SuperMatrix *A, int *perm_c, superlumt_options_t *options,SuperMatrix *AC);


            int  *TreePostorder (int, int *);

            int qrnzcnt(int neqns, int adjlen, int *xadj, int *adjncy, int *zfdperm,
	                        int *perm, int *invp, int *etpar, int *colcnt_h,
	                        int *nlnz, int *part_super_ata, int *part_super_h);


            //
            int await(volatile int *status);

            
            // Factorization related
            void pxgstrf_scheduler(const int pnum, const int n, const int *etree,  int *cur_pan, int *bcol, pxgstrf_shared_Base_t *pxgstrf_shared);

            int  ParallelFinalize (pxgstrf_shared_Base_t *pxgstrf_shared);
            int ParallelInit(int n, pxgstrf_relax_t *pxgstrf_relax, superlumt_options_t *superlumt_options, pxgstrf_shared_Base_t *pxgstrf_shared);
            int  queue_init (queue_t *, int);
            int  queue_destroy (queue_t *);
            int  EnqueueRelaxSnode (queue_t *, int, pxgstrf_relax_t *, pxgstrf_shared_Base_t *);
            int  EnqueueDomains(queue_t *, struct Branch *, pxgstrf_shared_Base_t *);
            int  Enqueue (queue_t *, qitem_t);
            int  Dequeue (queue_t *, qitem_t *);
            int  NewNsuper (const int, pxgstrf_shared_Base_t *, int *);

            void pxgstrf_resetrep_col(const int nseg, const int *segrep, int *repfnz);
            void countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLUBase_t *Glu);
            void fixupL(const int n, const int *perm_r, GlobalLUBase_t *Glu);


            void StatAlloc (const int, const int, const int, const int, Gstat_t*);
            void StatInit  (const int, const int, Gstat_t*);
            void StatFree  (Gstat_t*);

            //  Memory related 
            void pxgstrf_SetIWork (int, int, int *, int **, int **, int **,int **, int **, int **, int **);


        };
    };
};

#endif //__HNUMSLUCMN_H__
